# -*- encoding: utf-8 -*-
# @Author：lijinxi
# @Time ：2021/1/8 16:51
# @File：utils.py

# Split the dataset into multiple files (a more convenient way of doing this
# was found later).
#
# Step 1: read 'sars-cov2.fasta' and map every header line (stripped, leading
# '>' included) to the list of stripped lines that follow it.
dataset_dict = {}
key = ''  # header of the record currently being read; '' until the first header
with open('sars-cov2.fasta', 'r', encoding='utf-8') as f:
    for line in f:
        if line.startswith('>'):
            # A header opens a new record. A repeated header restarts its list.
            key = line.strip()
            dataset_dict[key] = []
            continue
        text = line.strip()
        if not key:
            # Nothing to attach this line to yet: tolerate leading blank lines,
            # but fail clearly instead of with KeyError('') on stray data.
            if text:
                raise ValueError(
                    "sars-cov2.fasta: sequence data found before the first "
                    "'>' header line"
                )
            continue
        dataset_dict[key].append(text)

import os  # imported here because the file has no import section to extend

# Step 2: write each record to datasets/<title>.fasta as its header line
# followed by its sequence lines.
os.makedirs('datasets', exist_ok=True)  # open(..., 'w') does not create directories
for key, val in dataset_dict.items():
    # The file name is the first '|'-separated field of the header with '>'
    # and every '.1' substring removed.
    # NOTE(review): replace('.1', '') also strips '.1' from inside a longer
    # suffix such as '.12', and distinct headers can map to the same title
    # (the later record then overwrites the earlier file) - confirm intended.
    title = key.split('|')[0].replace('>', '').replace('.1', '').strip()
    with open(os.path.join('datasets', title + '.fasta'), 'w', encoding='utf-8') as f:
        f.write(key + '\n')
        # Write the sequence lines unchanged. The loop body used to be a bare
        # string literal, so nothing after the header was ever written.
        # That disabled code also expanded what look like IUPAC ambiguity
        # codes before writing each line:
        #   R->AG, Y->CT, M->AC, K->GT, S->CG, W->AT,
        #   H->ACT, B->CGT, V->ACG, D->AGT, N->ACGT
        # The expansion changes sequence length and is deliberately left off.
        # NOTE(review): confirm raw sequences are what downstream code expects.
        f.writelines(line + '\n' for line in val)